import numpy as np
def compute_depth(order_book_side):
    """Return the total quantity resting on one side of the book.

    ``order_book_side`` is an iterable of ``(price, quantity)`` pairs;
    only the quantities are accumulated.
    """
    total = 0
    for _price, quantity in order_book_side:
        total += quantity
    return total
def compute_slope(order_book_side):
    """Slope of the bid/ask curve from its two best levels.

    Computed as (best price - second-best price) divided by the quantity at
    the best level. Returns None when fewer than two levels are present or
    the best-level quantity is not positive.
    """
    if len(order_book_side) < 2:
        # Not enough levels to form a price difference.
        return None
    best_price, best_qty = order_book_side[0]
    next_price = order_book_side[1][0]
    if best_qty <= 0:
        return None
    return (best_price - next_price) / best_qty
def compute_quantity_weighted_price(order_book_side):
    """Quantity-weighted average price over all levels on one side.

    Returns None when the side carries no quantity (including an empty side),
    since the weighted average is undefined in that case.
    """
    numerator = 0
    denominator = 0
    for price, quantity in order_book_side:
        numerator += price * quantity
        denominator += quantity
    if denominator == 0:
        return None
    return numerator / denominator
def compute_quantity_weighted_mid_quote(bid_side, ask_side):
    """Average of the quantity-weighted bid and ask prices.

    Returns None when either side's weighted price is undefined
    (i.e. that side carries no quantity).
    """
    wp_bid = compute_quantity_weighted_price(bid_side)
    if wp_bid is None:
        return None
    wp_ask = compute_quantity_weighted_price(ask_side)
    if wp_ask is None:
        return None
    return 0.5 * (wp_bid + wp_ask)
def compute_quantity_weighted_bid_ask_spread(bid_side, ask_side):
    """Quantity-weighted ask price minus quantity-weighted bid price.

    Returns None when either side's weighted price is undefined.
    """
    prices = (
        compute_quantity_weighted_price(bid_side),
        compute_quantity_weighted_price(ask_side),
    )
    if None in prices:
        return None
    wp_bid, wp_ask = prices
    return wp_ask - wp_bid
def compute_mid_quote_difference(mid_quote, bid_side, ask_side):
    """Difference between the plain mid-quote and the quantity-weighted mid-quote.

    Returns None when the quantity-weighted mid-quote is undefined.
    """
    weighted_mid = compute_quantity_weighted_mid_quote(bid_side, ask_side)
    return None if weighted_mid is None else mid_quote - weighted_mid
def get_order_book(timestamp, df):
    """Return (bid_side, ask_side) at *timestamp* as lists of (Price, Size) tuples.

    Bids are sorted best-first (descending price), asks best-first
    (ascending price). *df* must have 'Timestamp', 'Side', 'Price', 'Size'
    columns with Side values 'bid'/'ask'.
    """
    snapshot = df[df['Timestamp'] == timestamp]

    def _levels(side_label, ascending):
        # Price/Size pairs for one side, best price first.
        side = snapshot[snapshot['Side'] == side_label][['Price', 'Size']]
        side = side.sort_values(by='Price', ascending=ascending)
        return [tuple(row) for row in side.itertuples(index=False, name=None)]

    return _levels('bid', False), _levels('ask', True)
import pandas as pd
def compute_orderbook_changes(orderbook_df):
    """
    Compute sequential changes in the order book between consecutive timestamps.

    Parameters:
    - orderbook_df: DataFrame with ['Price', 'Size', 'Side', 'Timestamp'].

    Returns:
    - DataFrame with columns ['Timestamp', 'Price', 'Side', 'Prev_Size',
      'New_Size', 'Size_Change'], one row per (price, side) level whose size
      changed between two consecutive timestamps. Levels absent from one
      snapshot are treated as size 0.

    The caller's DataFrame is left unmodified (previously the Timestamp
    column was converted to datetime in place, mutating the caller's frame).
    """
    # Work on a copy so the caller's DataFrame is never mutated.
    orderbook_df = orderbook_df.copy()
    orderbook_df['Timestamp'] = pd.to_datetime(orderbook_df['Timestamp'])
    # Sort by timestamp and price for consistent comparison; drop exact duplicates.
    orderbook_df = (
        orderbook_df.sort_values(by=["Timestamp", "Price"])
        .reset_index(drop=True)
        .drop_duplicates()
    )
    changes = []
    # Unique timestamps (in sorted order after the sort above).
    timestamps = orderbook_df['Timestamp'].unique()
    for i in range(len(timestamps) - 1):
        t1, t2 = timestamps[i], timestamps[i + 1]
        # Size keyed by (Price, Side) at each of the two consecutive snapshots.
        ob_t1 = orderbook_df[orderbook_df['Timestamp'] == t1].set_index(['Price', 'Side'])['Size']
        ob_t2 = orderbook_df[orderbook_df['Timestamp'] == t2].set_index(['Price', 'Side'])['Size']
        # fill_value=0 treats levels present in only one snapshot as size 0.
        size_changes = ob_t2.subtract(ob_t1, fill_value=0)
        # Keep only levels whose size actually changed, with before/after sizes.
        for (price, side), change in size_changes.items():
            if change != 0:
                prev_size = ob_t1.get((price, side), 0)
                new_size = ob_t2.get((price, side), 0)
                changes.append([t2, price, side, prev_size, new_size, change])
    return pd.DataFrame(
        changes,
        columns=['Timestamp', 'Price', 'Side', 'Prev_Size', 'New_Size', 'Size_Change'],
    )
The total depth on one side of the order book is the sum of all available quantities:
$ \text{Depth} = \sum_{i} q_i $
where $q_i$ is the quantity available at each price level $i$.
The slope of the bid or ask curve measures how quickly the price changes with respect to quantity:
$ \text{Slope}_{\text{bid}} = \frac{p_{\text{bid},1} - p_{\text{bid},2}}{q_{\text{bid},1}} $
where $p_{\text{bid},1}$ and $p_{\text{bid},2}$ are the two best (highest) bid prices and $q_{\text{bid},1}$ is the quantity available at the best bid.
The ask slope is defined analogously using the two best (lowest) ask prices and the quantity at the best ask.
The quantity-weighted price for bids or asks is:
$ \text{WP}_{\text{side}} = \frac{\sum_{i} p_{\text{side},i} \cdot q_{\text{side},i}}{\sum_{i} q_{\text{side},i}} $
where $p_{\text{side},i}$ and $q_{\text{side},i}$ are the price and quantity at level $i$ on the given side (bid or ask).
The quantity-weighted mid-quote is the average of the quantity-weighted bid and ask prices:
$ \text{WMid} = \frac{\text{WP}_{\text{bid}} + \text{WP}_{\text{ask}}}{2} $
where $\text{WP}_{\text{bid}}$ and $\text{WP}_{\text{ask}}$ are the quantity-weighted bid and ask prices defined above.
The quantity-weighted bid-ask spread is given by:
$ \text{WSpread} = \text{WP}_{\text{ask}} - \text{WP}_{\text{bid}} $
which represents the difference between the quantity-weighted ask and bid prices.
The difference between the regular mid-quote and the quantity-weighted mid-quote is:
$ \text{Mid-Quote Difference} = \text{MidQuote} - \text{WMid} $
where $\text{MidQuote}$ is the traditional mid-point between the best bid and ask prices.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import json
import warnings
warnings.simplefilter("ignore")
# Open and load the JSON file
with open("matchup_details.json", "r") as file:
    matchups = json.load(file)  # Parse JSON into a Python dictionary

# Keys appear to be of the form "TeamA vs. TeamB"; tokens 0 and 2 of the
# space-split key are the two team names — TODO confirm against the JSON.
matchups_list = [(i.split(' ')[0], i.split(' ')[2]) for i in list(matchups.keys())]

# Dictionary to store where each team appears in the filtered list
team_indices = {}
filtered_matchups = []
# Keep at most one matchup per team: when a team reappears in a later
# matchup, remove its earlier matchup from the filtered list and shift the
# recorded indices of every entry that came after it.
for i, (team1, team2) in enumerate(matchups_list):
    if team1 in team_indices or team2 in team_indices:
        # Index of the earlier matchup involving either team (team1 wins ties).
        old_index = team_indices.get(team1, team_indices.get(team2))
        if old_index is not None:
            filtered_matchups.pop(old_index)
            # Re-align stored indices after the removal above.
            for key in team_indices:
                if team_indices[key] > old_index:
                    team_indices[key] -= 1
    # Record both teams' position and append the (possibly re-added) matchup.
    team_indices[team1] = len(filtered_matchups)
    team_indices[team2] = len(filtered_matchups)
    filtered_matchups.append((team1, team2))
# Outcomes dict for each surviving matchup, skipping keys missing from the JSON.
id_filtered = [matchups[" vs. ".join(i)]['outcomes'] for i in filtered_matchups if " vs. ".join(i) in matchups]
# NOTE(review): both 'keys' assignments below are dead — 'keys' is rebound as
# the loop variable over 'gamelist' further down; these look like leftover
# scratch work from interactive exploration.
keys = list(id_filtered[2].keys())
# comparison = df.sort_values(['Timestamp', 'Price']).drop_duplicates()
keys = list(id_filtered[3].keys())
# One list of outcome keys (presumably team/outcome identifiers) per matchup.
gamelist = [list(i.keys()) for i in id_filtered]
# gamelist = [list(id_filtered[i].keys()) for i in [2,3,4,6,7]]
# For each game from the third onward: load every team's order book from a
# parquet file, compute per-timestamp size changes, and plot bid/ask activity
# plus weighted mid-quote / spread for the first two teams side by side.
for keys in gamelist[2:]:
    team_data_changes = {}
    for team in keys:
        # NOTE(review): assumes a file '<team>.parquet' exists in the working
        # directory with ['Price', 'Size', 'Side', 'Timestamp'] — confirm.
        df = pd.read_parquet(team + '.parquet')
        bid_df = df[df.Side == 'bid']
        ask_df = df[df.Side == 'ask']
        # Changes are computed per side, then stacked into one frame.
        bid_changes = compute_orderbook_changes(bid_df)
        ask_changes = compute_orderbook_changes(ask_df)
        team_data_changes[team] = pd.concat([bid_changes, ask_changes])
    fig, axes = plt.subplots(1, 2, figsize=(30, 8), sharex=True, sharey=True, facecolor=(1, 1, 1))
    for i, team in enumerate(keys[:2]):  # Only plot top two
        col = i
        bid_data_changes = team_data_changes[team][team_data_changes[team].Side == 'bid']
        ask_data_changes = team_data_changes[team][team_data_changes[team].Side == 'ask']
        # Squares = bid-side size changes, circles = ask-side; color encodes
        # the signed size change via the 'coolwarm' colormap.
        scatter_bid = axes[col].scatter(
            bid_data_changes["Timestamp"], bid_data_changes["Price"],
            c=bid_data_changes["Size_Change"], cmap="coolwarm", s=200,
            edgecolors="k", marker='s', label="Bid Changes"
        )
        scatter_ask = axes[col].scatter(
            ask_data_changes["Timestamp"], ask_data_changes["Price"],
            c=ask_data_changes["Size_Change"], cmap="coolwarm", s=200,
            edgecolors="k", marker='o', label="Ask Changes"
        )
        # Recompute liquidity summary statistics at each change timestamp.
        # NOTE(review): the parquet file is re-read once per timestamp inside
        # the loop — could be read once and reused.
        timestamps = sorted(team_data_changes[team]['Timestamp'].unique())
        bid_depths, ask_depths, mid_quotes, bid_ask_spreads = [], [], [], []
        for timestamp in timestamps:
            bid_side, ask_side = get_order_book(timestamp, pd.read_parquet(f'{team}.parquet').drop_duplicates())
            bid_depths.append(compute_depth(bid_side))
            ask_depths.append(compute_depth(ask_side))
            mid_quotes.append(compute_quantity_weighted_mid_quote(bid_side, ask_side))
            bid_ask_spreads.append(compute_quantity_weighted_bid_ask_spread(bid_side, ask_side))
        timestamps_num = [t.timestamp() for t in timestamps]  # NOTE(review): unused — left as-is
        axes[col].plot(timestamps, mid_quotes, label="Quantity-Weighted Mid-Quote", marker='o', color='black', markersize=10)
        axes[col].plot(timestamps, bid_ask_spreads, label="Quantity-Weighted Bid-Ask Spread", marker='s', color='red', markersize=12)
        axes[col].set_xlabel("Timestamp", fontsize=25)
        axes[col].set_ylabel("Price Level", fontsize=25)
        axes[col].set_title(f"{team} - Bid & Ask Changes", fontsize=35)
        axes[col].tick_params(axis="x", rotation=45)
        axes[col].grid(True)
    # One shared legend built from the left panel's handles.
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', ncol=10, fontsize=20)
    # NOTE(review): 'team' here is whatever the inner loop left bound (the
    # second plotted team), so the suptitle names only one of the two teams.
    plt.suptitle(f'Order Book Visualization for {team}', fontsize=40)
    plt.tight_layout(rect=[0, 0.1, 1, 1])
    plt.show()
To test the hypothesis that order book liquidity is unevenly distributed and market depth affects bid-ask spread fluctuations, we can analyze the following:
- Market depth: how total resting quantity is distributed across price levels on each side of the book.
- Spread volatility vs. liquidity: whether bid-ask spread fluctuations widen when depth is thin.
- Impact of market orders: how incoming marketable orders consume depth and move the spread.
- Order flow imbalance: the difference between bid-side and ask-side order flow over time.
- Latency effects: how delays between order book updates distort the observed liquidity picture.
A Bayesian state-space model can estimate the true latent liquidity profile and spread dynamics.
Use Markov Chain Monte Carlo (MCMC) or Variational Inference to estimate the posterior distribution of $L_t$ and $S_t$.
# Modified Simulation: Liquidity Shocks and Nonlinear Spread Response
#
# State-space simulation: latent liquidity L_t follows a random walk with
# occasional crashes; the latent spread S_t reacts nonlinearly to liquidity;
# O_obs and V_obs are noisy observations of S_t and L_t respectively.

# Parameters
T = 100          # Time steps
sigma_L = 1.5    # Volatility of liquidity
sigma_S = 0.9    # Volatility of spread
sigma_O = 2.0    # Increased observation noise for spread
sigma_V = 2.0    # Increased observation noise for liquidity
alpha = 2.5      # Stronger influence of liquidity on spread

# Initialize arrays
L_true = np.zeros(T)  # True liquidity
S_true = np.zeros(T)  # True spread
O_obs = np.zeros(T)   # Observed spread
V_obs = np.zeros(T)   # Observed volume

# Initial values
L_true[0] = np.random.normal(5, sigma_L)  # Starting liquidity
S_true[0] = np.random.normal(2, sigma_S)  # Starting spread
# Fix: also generate the initial observations. Previously O_obs[0] and
# V_obs[0] were left at 0, producing a spurious zero data point at t = 0
# in the plots even though the latent state starts near 2 and 5.
O_obs[0] = S_true[0] + np.random.normal(0, sigma_O)
V_obs[0] = L_true[0] + np.random.normal(0, sigma_V)

# Simulate state-space dynamics with liquidity crashes
for t in range(1, T):
    # Introduce liquidity shocks randomly
    if np.random.rand() < 0.1:  # 10% probability of a liquidity crash
        L_true[t] = L_true[t-1] * 0.2  # Liquidity drops sharply
    else:
        L_true[t] = L_true[t-1] + np.random.normal(0, sigma_L)
    # Spread reacts nonlinearly to liquidity: widens as L_t -> 0; the +0.1
    # keeps the denominator away from zero.
    S_true[t] = S_true[t-1] + alpha / (L_true[t] + 0.1) + np.random.normal(0, sigma_S)
    # Observations with added noise
    O_obs[t] = S_true[t] + np.random.normal(0, sigma_O)
    V_obs[t] = L_true[t] + np.random.normal(0, sigma_V)
# Plot results: top panel compares latent liquidity with its noisy volume
# proxy; bottom panel compares the latent spread with the observed spread.
fig, ax = plt.subplots(2, 1, figsize=(15, 7), sharex=True)
ax[0].plot(L_true, label=r'True Liquidity $(L_t)$', linestyle="--", color="blue", lw=4)
ax[0].plot(V_obs, label=r'Observed Volume $(V_t)$', linestyle="dotted", color="red", lw=4)
ax[0].set_ylabel("Liquidity / Volume", fontsize=15)
ax[0].legend(fontsize=15)
ax[0].set_title("Liquidity and Observed Volume", fontsize=25)
ax[1].plot(S_true, label="True Spread (S_t)", linestyle="--", color="green", lw=3)
ax[1].plot(O_obs, label="Observed Spread (O_t)", linestyle="dotted", color="orange", lw=3)
ax[1].set_ylabel("Bid-Ask Spread", fontsize=15)
ax[1].set_xlabel("Time Step", fontsize=15)
ax[1].legend(fontsize=15)
ax[1].set_title("Bid-Ask Spread and Observed Spread", fontsize=25)
plt.tight_layout()
plt.show()
First Plot (Liquidity & Observed Volume): This shows the true liquidity $ L_t $ (blue dashed) and the observed volume $ V_t $ (red dotted). The key takeaway is that liquidity can crash suddenly, and observed volume is a noisy proxy that does not perfectly track the true liquidity state. When liquidity drops, market conditions worsen, leading to higher spreads.
Second Plot (Bid-Ask Spread & Observed Spread): This illustrates the true spread $ S_t $ (green dashed) and the observed spread $ O_t $ (orange dotted). The key insight is that spreads widen dramatically when liquidity crashes, showing a nonlinear relationship. The observed spread contains noise, making direct market signals unreliable without proper filtering.
Liquidity Drives Spread Formation: The model shows that when market liquidity $ L_t $ drops, the bid-ask spread $ S_t $ widens nonlinearly. This captures how thin order books lead to higher trading costs, making it critical for traders to monitor liquidity depth.
Predicting Liquidity Crashes: By modeling hidden liquidity as a state-space variable, we can estimate when market makers might pull liquidity. This helps traders anticipate spread widening events and adjust their strategies accordingly.
Noise Filtering for Execution: The observed volume $ V_t $ and spread $ O_t $ contain noise, making raw order book data misleading. Using a Bayesian approach, we can infer true liquidity conditions, which improves execution timing for large trades.
Exploiting Market Inefficiencies: Traders can profit from transient liquidity shocks by identifying temporary spread dislocations. When the model detects a liquidity-induced spread spike, market makers can provide liquidity at wider spreads for higher returns.
A trader can exploit this using:
- Spread mean reversion: trading temporary spread dislocations back toward their normal level once the liquidity shock passes.
- Liquidity detection and execution: timing large orders for moments when inferred (not merely observed) liquidity is high.
- Sniping and latency arbitrage: reacting to stale quotes before slower participants refresh them.
- Identifying stop hunts and fake liquidity: spotting displayed size that is withdrawn as soon as it is approached.